#Authors: [INSERT NAMES HERE]
#Author Date:
#Purpose: The purpose of this notebook is to house all data set transformation, cleansing, visualization, statistical analysis, and note-taking for the 2025 CU Athletic Department Sports Science Internship Program
#LAST UPDATED:
#Including helpful libraries
library(tidyverse)
library(readxl)
library(aod)
library(gt)
library(boot)
library(mgcv)
#loading in the Catapult data to look at sprinting values
Catapult_Session <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Catapult Session - Outdoor FB.csv")
#loading in the Historical Running data to look at running imbalance values
Historical_Running <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Compiled Historical Running Imbalance FB.csv")
#loading in the Incident Report to look at HSIs
Incident_Report <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Incident Report FB IDs.csv")
Catapult_Session_clean <- Catapult_Session %>%
#putting the date as a date class
mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
#only selecting important columns for this analysis
select(anon_id, Date, Age, Primary.Position, Total.Distance, Period.Name, Total.Duration..min., Velocity.Band.1.Total.Distance, Velocity.Band.2.Total.Distance, Velocity.Band.3.Total.Distance, Velocity.Band.4.Total.Distance, Velocity.Band.5.Total.Distance, Velocity.Band.6.Total.Distance, Velocity.Band.7.Total.Distance, Velocity.Band.8.Total.Distance, Velocity.Band.2.Total.Effort.Count, Velocity.Band.3.Total.Effort.Count, Velocity.Band.4.Total.Effort.Count, Velocity.Band.5.Total.Effort.Count, Velocity.Band.6.Total.Effort.Count, Velocity.Band.7.Total.Effort.Count, Velocity.Band.8.Total.Effort.Count, Maximum.Velocity, Average.Velocity, Hit.90.Percent.Max, Date.of.Last.90.Effort, Days.Since.Last.90.Effort, Hit.Max.Velocity., Date.of.All.Time.Max.Velocity, Days.Since.Max.Velocity) %>%
#calculating each player's maximum velocity
group_by(anon_id) %>%
mutate(Player.Max.Velocity = max(na.omit(Maximum.Velocity))) %>%
ungroup() %>%
#only selecting data from January 1, 2024 and on
filter(Date >= "2024-01-01")
head(Catapult_Session_clean)
Historical_Running_clean <- Historical_Running %>%
#taking out rows that don't have data
filter(Running.Imbalance != "n/a") %>%
#putting running imbalance as a number and converting the date to a date class
mutate(Running.Imbalance = as.numeric(Running.Imbalance),
Date = as.Date(Date, "%m/%d/%Y")) %>%
#only using data from January 1, 2024 and on
filter(Date >= "2024-01-01") %>%
mutate(X=1:4063) %>%
#making days January 1, 2024
group_by(anon_id) %>%
mutate(Days.Since.Start = as.numeric(Date - min(Date))) %>%
ungroup()
head(Historical_Running_clean)
Incident_Report_clean <- Incident_Report %>%
#filtering for only hamstring injuries
filter(OSICS14.Code == "TM1",
Status != "Full Go") %>%
#getting the date of the injury as a date class
mutate(Date = as.Date(Date, "%m/%d/%Y"),
Date.of.Injury = as.Date(Date.of.Injury...Onset.of.symptoms, "%m/%d/%Y"),
Examination.Date = as.Date(Examination.Date, "%m/%d/%Y")) %>%
#only selecting relevant columns for this analysis
select(anon_id, Position, Date, Date.of.Injury, Time.of.Injury, Side, OSICS.Injury.Diagnosis, Coach.s.Diagnosis, Recurrence.of.Injury, Choose.Season, Onset.of.Symptoms, Injury.Prognosis, General.Mechanism, Specific.Mechanism, Injured.While., Type.of.Event, Season., Status, Days.in.Status) %>%
#making days out due to injury for each player and each injury they sustained
group_by(anon_id, Date.of.Injury) %>%
mutate(Days.Out = sum(Days.in.Status)) %>%
ungroup()
head(Incident_Report_clean)
#taking the IDs of players who are and aren't injured
all_IDs <- unique(Historical_Running_clean$anon_id)
#taking IDs that were injured and also have running imbalance data
injured_IDs <- intersect(unique(Incident_Report_clean$anon_id), all_IDs)
#taking all players with running imbalance data that don't have an injury
uninjured_IDs <- unique((Historical_Running_clean %>%
filter(!anon_id %in% injured_IDs))$anon_id)
#injured players who also have running imbalance data
Incident_Report_clean <- Incident_Report_clean %>%
filter(anon_id %in% injured_IDs)
#all players that only have running imbalance data or have both running imbalance data and incidence report
Historical_Running_clean <- Historical_Running_clean %>%
filter(anon_id %in% injured_IDs | anon_id %in% uninjured_IDs)
#removing uncleaned data sets
remove(Incident_Report)
remove(Historical_Running)
remove(Catapult_Session)
# Bar chart for how often each player reaches ≥ 90% maximum velocity
# Count for how many times each anon_id hit ≥ 90% maximum velocity
hit_90_counts <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% # Filter for training season
filter(Hit.90.Percent.Max == "Yes") %>%
distinct(anon_id,Date) %>%
group_by(anon_id) %>%
summarise(times_hit_90 = n())
# Plot of all players' frequencies
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
geom_bar(stat = "identity", fill = "#CFB87C") +
geom_hline(yintercept = mean(hit_90_counts$times_hit_90),
linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
labs(title = "Player Counts for Achieving ≥ 90% of Maximum Velocity During 2024–25 Season",
subtitle = paste("Team Average:", round(mean(hit_90_counts$times_hit_90), 1)),
x = "Athlete ID", y = "Times ≥ 90%") +
theme_classic() +
theme(axis.text.x = element_text(size = 6, angle = 90))
hit_90_counts <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
Hit.90.Percent.Max == "Yes") %>%
distinct(anon_id, Date, .keep_all = TRUE) %>% # Keep position data
group_by(anon_id, Primary.Position) %>% # Group by both ID and position
summarise(times_hit_90 = n(), .groups = "drop")
QBs <- hit_90_counts %>%
filter(Primary.Position == "QB")
# Calculate the averages first
overall_avg <- mean(hit_90_counts$times_hit_90)
qb_avg <- mean(QBs$times_hit_90)
ggplot(QBs, aes(x=anon_id, y=times_hit_90)) +
geom_bar(stat="identity", fill = "#CFB87C") +
geom_text(aes(label = times_hit_90),
vjust = -0.5,
size = 3.5) +
geom_hline(yintercept = overall_avg,
linetype = "dashed", color = "#000000") +
geom_hline(yintercept = qb_avg,
linetype = "dashed", color = "#565A5C") +
annotate("text", x = 1, y = overall_avg + 0.5,
label = "Team Avg", color = "#000000", size = 3) +
annotate("text", x = 1, y = qb_avg + 0.5,
label = "QB Avg", color = "#565A5C", size = 3) +
labs(title = "QB counts for reaching > 90% max velocity",
subtitle = paste("QB Average:", qb_avg)) +
theme_classic()
# Facet Plot
# Recalculate hit_90_counts for all positions
hit_90_counts <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
Hit.90.Percent.Max == "Yes") %>%
distinct(anon_id, Date, .keep_all = TRUE) %>% # Keep position data
group_by(anon_id, Primary.Position) %>% # Group by both ID and position
summarise(times_hit_90 = n(), .groups = "drop")
# Calculate overall average
overall_avg <- mean(hit_90_counts$times_hit_90)
# Plot faceted bar charts by position
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
geom_bar(stat = "identity", fill = "#CFB87C") +
geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
geom_hline(yintercept = overall_avg, linetype = "dashed", color = "#000000") +
annotate("text", x = 1, y = overall_avg + 0.5,
label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
facet_wrap(~ Primary.Position, scales = "free_x") +
labs(title = "Counts of >90% Max Velocity by Player and Position",
y = "Times > 90% Max Velocity",
x = "Player (anon_id)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plots for each position
hit_90_counts <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
Hit.90.Percent.Max == "Yes") %>%
distinct(anon_id, Date, .keep_all = TRUE) %>%
group_by(anon_id, Primary.Position) %>%
summarise(times_hit_90 = n(), .groups = "drop")
# Get the overall team average once from hit_90_counts
team_avg <- mean(hit_90_counts$times_hit_90)
# Define positions to loop through
positions <- unique(hit_90_counts$Primary.Position)
# Plotting function:
plot_hit_90_by_position <- function(pos) {
position_data <- hit_90_counts %>%
filter(Primary.Position == pos)
pos_avg <- mean(position_data$times_hit_90)
ggplot(position_data, aes(x = anon_id, y = times_hit_90)) +
geom_bar(stat = "identity", fill = "#CFB87C") +
geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
geom_hline(yintercept = pos_avg, linetype = "dashed", color = "#565A5C") +
annotate("text", x = 1, y = pos_avg + 0.5,
label = "Pos Avg", color = "#565A5C", size = 3, hjust = 0) +
geom_hline(yintercept = team_avg, linetype = "dashed", color = "#000000") +
annotate("text", x = 1, y = team_avg + 0.5,
label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
labs(title = paste("Times ≥90% Max Velocity –", pos),
subtitle = paste0("Position Avg: ", round(pos_avg, 1),
" | Team Avg: ", round(team_avg, 1)),
y = "Count",
x = "anon_id") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Loop through each position and print the plot
plots_by_position <- lapply(positions, function(pos) {
print(plot_hit_90_by_position(pos))
})
# Table for average values of each position
position_averages <- hit_90_counts %>%
group_by(Primary.Position) %>%
summarise(avg_times_hit_90 = mean(times_hit_90), .groups = "drop") %>%
arrange(desc(avg_times_hit_90))
position_averages_with_team <- bind_rows(
tibble(
Primary.Position = "Team Average",
avg_times_hit_90 = team_avg
),
position_averages
)
position_averages_with_team %>%
gt() %>%
tab_header(
title = "Average Times >90% Max Velocity by Position and Team"
)
# Create weekly sprint count totals
# Create a week variable
Catapult_Session_clean <- Catapult_Session_clean %>%
mutate(week = floor_date(Date, unit = "week"))
# Get unique athletes and all weeks in your date range
all_athletes <- unique(Catapult_Session_clean$anon_id)
all_weeks <- seq.Date(as.Date("2024-06-30"), as.Date("2025-07-01"), by = "week")
# Create a full grid of anon_id and weeks (this includes weeks with 0 sprints)
full_grid <- expand_grid(anon_id = all_athletes, week = all_weeks)
# Count sprints per athlete-week
sprint_counts <- Catapult_Session_clean %>%
filter(Hit.90.Percent.Max == "Yes") %>%
distinct(anon_id, Date, .keep_all = TRUE) %>%
group_by(anon_id, week) %>%
summarise(sprint_count = n(), .groups = "drop")
# Join full grid with actual sprint counts and replace NAs with 0s
weekly_sprint_counts <- full_grid %>%
left_join(sprint_counts, by = c("anon_id", "week")) %>%
mutate(sprint_count = replace_na(sprint_count, 0))
# Categorize sprint load groups
weekly_sprint_counts <- weekly_sprint_counts %>%
mutate(sprint_load_group = case_when(
sprint_count <= 1 ~ "Low (≤1 sprint)",
sprint_count %in% 2:3 ~ "Moderate (2–3 sprints)",
sprint_count %in% 4:5 ~ "High (4–5 sprints)"
))
# Merge weekly sprints and injury data frames
# Create an injury weeks data frame from incident report
# This should indicate which week the injury occurred
injury_weeks <- Incident_Report_clean %>%
mutate(
week = floor_date(as.Date(Date.of.Injury), unit = "week")
) %>%
filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
select(anon_id, week) %>%
distinct() %>%
mutate(injury = 1)
# Merge with weekly sprints data frame
weekly_sprint_counts <- weekly_sprint_counts %>%
left_join(injury_weeks, by = c("anon_id", "week")) %>%
mutate(injury = ifelse(is.na(injury), 0, injury)) # Fill NAs with 0 (no injury)
# Table to compare injury weeks to non injury weeks
weekly_sprint_counts %>%
group_by(injury) %>%
summarise(
mean_sprints = mean(sprint_count, na.rm = TRUE),
sd_sprints = sd(sprint_count, na.rm = TRUE),
n = n()
)
ggplot(weekly_sprint_counts, aes(x = week, y = sprint_count, color = factor(injury))) +
geom_line(aes(group = anon_id), alpha = 0.5) +
geom_point(data = subset(weekly_sprint_counts, injury == 1), size = 2, shape = 21, fill = "red") +
scale_color_manual(values = c("0" = "grey", "1" = "red"), labels = c("No Injury", "Injury")) +
labs(title = "Weekly Sprint Counts with Injury Events", x = "Week", y = "Sprint Count", color = "Injury") +
theme_minimal()
No injuries occurred when an athlete had 4 or 5 sprint counts within a week, however there are limited data points in these sprint counts. There are also only three injuries that occur in the moderate range of 2-3 sprint counts The most injuries occur in the low range of 0-1 sprints.
This suggests that underexposure to sprinting may increase injury risk.
injury_by_group <- weekly_sprint_counts %>%
group_by(sprint_load_group) %>%
summarise(
total_weeks = n(),
injuries = sum(injury),
injury_rate = injuries / total_weeks
)
injury_by_group
The moderate group (2-3 sprints) has the highest injury rate (1.18%). Low sprints still had injuries, but at at lower rate (0.36%). High sprints had zero injuries, but the sample size is very small so we cannot draw a reliable conclusion from this group.
# Build contingency table
sprint_injury_table <- matrix(
c(0, 13, 3, 254, 22, 6093),
nrow = 3,
byrow = TRUE,
dimnames = list(
sprint_group = c("High", "Moderate", "Low"),
injury = c("Injury", "No Injury")
)
)
# Run Fisher's Exact Test
fisher.test(sprint_injury_table)
Fisher's Exact Test for Count Data
data: sprint_injury_table
p-value = 0.1235
alternative hypothesis: two.sided
Null hypothesis: Injury occurrence is independent of sprint load group. Alternative: Injury occurrence is associated with sprint load group.
Because the p-value is > 0.05, we fail to reject the null. there is no strong evidence of a significant association between sprinting effort group and injury occurrence based the current data.
Now we will explore the previous week’s sprint count.
# Create data set to explore the sprint count for the week before
# Shift sprint counts by 1 week for each athlete
weekly_sprint_counts_lagged <- weekly_sprint_counts %>%
arrange(anon_id, week) %>%
group_by(anon_id) %>%
mutate(
sprint_count_lag1 = lag(sprint_count),
sprint_load_group_lag1 = lag(sprint_load_group)
) %>%
ungroup()
# Create injury weeks again (same as before)
injury_weeks <- Incident_Report_clean %>%
mutate(
week = floor_date(as.Date(Date.of.Injury), unit = "week")
) %>%
filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
select(anon_id, week) %>%
distinct() %>%
mutate(injury = 1)
# Merge with sprint data using CURRENT week for injury
sprint_prior_to_injury <- weekly_sprint_counts_lagged %>%
left_join(injury_weeks, by = c("anon_id", "week")) %>%
mutate(injury = replace_na(injury, 0)) # Fill missing with 0s
Error in `mutate()`:
ℹ In argument: `injury = replace_na(injury, 0)`.
Caused by error:
! object 'injury' not found
Run `]8;;x-r-run:rlang::last_trace()rlang::last_trace()]8;;` to see where the error occurred.
# Join to add week number per athlete
weekly_sprint_counts <- weekly_sprint_counts %>%
group_by(anon_id) %>%
arrange(week) %>%
mutate(week_number = row_number()) %>%
ungroup()
# Identify weeks just before injury
pre_injury_weeks <- weekly_sprint_counts %>%
group_by(anon_id) %>%
mutate(
next_injury = lead(injury),
is_pre_injury = ifelse(next_injury == 1, 1, 0)
) %>%
ungroup() %>%
filter(is_pre_injury == 1)
# Compare average sprint counts
pre_injury_weeks %>%
summarise(mean_pre_injury_sprints = mean(sprint_count, na.rm = TRUE))
weekly_sprint_counts %>%
filter(injury == 0) %>%
summarise(mean_non_injury_sprints = mean(sprint_count, na.rm = TRUE))
NA
ggplot(weekly_sprint_counts, aes(x = factor(injury), y = sprint_count, fill = factor(injury))) +
geom_boxplot() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon")) +
labs(title = "Sprint Count Distribution by Injury Status", x = "Injury", y = "Sprint Count") +
theme_minimal()
# Create sum of weekly band totals
# Make sure week is defined
Catapult_Session_clean <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
mutate(week = floor_date(Date, unit = "week"))
# Sum efforts by week and athlete
weekly_velocity_efforts <- Catapult_Session_clean %>%
group_by(anon_id, week) %>%
summarise(
V2 = sum(Velocity.Band.2.Total.Effort.Count, na.rm = TRUE),
V3 = sum(Velocity.Band.3.Total.Effort.Count, na.rm = TRUE),
V4 = sum(Velocity.Band.4.Total.Effort.Count, na.rm = TRUE),
V5 = sum(Velocity.Band.5.Total.Effort.Count, na.rm = TRUE),
V6 = sum(Velocity.Band.6.Total.Effort.Count, na.rm = TRUE),
V7 = sum(Velocity.Band.7.Total.Effort.Count, na.rm = TRUE),
V8 = sum(Velocity.Band.8.Total.Effort.Count, na.rm = TRUE),
total_efforts = V2 + V3 + V4 + V5 + V6 + V7 + V8,
Weekly_Max_Velocity = max(Maximum.Velocity, na.rm = TRUE),
Player_Max_Velocity = first(Player.Max.Velocity),
.groups = "drop"
) %>%
mutate(
pct_of_max_velocity = (Weekly_Max_Velocity / Player_Max_Velocity) * 100
)
# Create percentage of weekly efforts in each band
weekly_velocity_efforts <- weekly_velocity_efforts %>%
mutate(
pct_V2 = (V2 / total_efforts) * 100,
pct_V3 = (V3 / total_efforts) * 100,
pct_V4 = (V4 / total_efforts) * 100,
pct_V5 = (V5 / total_efforts) * 100,
pct_V6 = (V6 / total_efforts) * 100,
pct_V7 = (V7 / total_efforts) * 100,
pct_V8 = (V8 / total_efforts) * 100
)
# Plot for one player
# Identify player
player_id <- "ID_93"
# Filter to that player and reshape
player_weekly_data <- weekly_velocity_efforts %>%
filter(anon_id == player_id) %>%
select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
# Reshape to long format for plotting
pivot_longer(
cols = starts_with("pct_V"),
names_to = "velocity_band",
values_to = "percent_effort"
) %>%
mutate(
velocity_band = factor(velocity_band, levels = paste0("pct_V", 2:8),
labels = paste0("V", 2:8))
)
ggplot(player_weekly_data, aes(x = week)) +
geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
geom_line(aes(y = pct_of_max_velocity, group = 1),
color = "black",
size = 1.2) +
geom_point(aes(y = pct_of_max_velocity),
color = "black",
size = 2) +
scale_fill_brewer(palette = "Set2") +
labs(
title = paste("Velocity Band Effort % and Speed Trend for", player_id),
x = "Week",
y = "Percent of Total Weekly Effort",
fill = "Velocity Band"
) +
scale_y_continuous(
name = "Percent of Total Effort",
sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
) +
theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
# Plot for each player
# Loop over all players and display their plots
unique(weekly_velocity_efforts$anon_id) %>%
lapply(function(player_id) {
# Prepare the data for one player
player_weekly_data <- weekly_velocity_efforts %>%
filter(anon_id == player_id) %>%
select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
pivot_longer(
cols = starts_with("pct_V"),
names_to = "velocity_band",
values_to = "percent_effort"
) %>%
mutate(
velocity_band = factor(
velocity_band,
levels = paste0("pct_V", 2:8),
labels = paste0("V", 2:8)
)
)
# Skip empty data
if (nrow(player_weekly_data) == 0) return(NULL)
# Generate and print the plot
plot <- ggplot(player_weekly_data, aes(x = week)) +
geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(
name = "Percent of Total Effort",
sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
) +
labs(
title = paste("Velocity Band Effort % and Speed Trend for Player", player_id),
x = "Week",
fill = "Velocity Band"
) +
theme_classic()
print(plot) # Display plot
return(NULL)
})
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 35 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 35 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 14 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 28 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 21 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 35 rows containing missing values or values outside the scale range
(`geom_col()`).
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL
[[56]]
NULL
[[57]]
NULL
[[58]]
NULL
[[59]]
NULL
[[60]]
NULL
[[61]]
NULL
[[62]]
NULL
[[63]]
NULL
[[64]]
NULL
[[65]]
NULL
[[66]]
NULL
[[67]]
NULL
[[68]]
NULL
[[69]]
NULL
[[70]]
NULL
[[71]]
NULL
[[72]]
NULL
[[73]]
NULL
[[74]]
NULL
[[75]]
NULL
[[76]]
NULL
[[77]]
NULL
[[78]]
NULL
[[79]]
NULL
[[80]]
NULL
[[81]]
NULL
[[82]]
NULL
[[83]]
NULL
[[84]]
NULL
[[85]]
NULL
[[86]]
NULL
[[87]]
NULL
[[88]]
NULL
[[89]]
NULL
[[90]]
NULL
[[91]]
NULL
[[92]]
NULL
[[93]]
NULL
[[94]]
NULL
[[95]]
NULL
[[96]]
NULL
[[97]]
NULL
[[98]]
NULL
[[99]]
NULL
[[100]]
NULL
[[101]]
NULL
[[102]]
NULL
[[103]]
NULL
[[104]]
NULL
# Pick one player
player_id <- "ID_93"
# Filter data for that player and weeks
player_data <- weekly_velocity_efforts %>%
filter(anon_id == player_id)
# Plot of Total Sprinting Efforts per Week
ggplot(player_data, aes(x = week, y = total_efforts)) +
geom_line(color = "#CFB87C", size = 1) +
geom_point(color = "#CFB87C", size = 2) +
labs(
title = paste("Total Sprinting Efforts per Week for Player", player_id),
x = "Week",
y = "Total Sprinting Efforts"
) +
theme_classic()
#Plot of Sprint Effort Distribution by Velocity Band
# Reshape absolute counts to long format
player_counts_long <- player_data %>%
select(week, V2, V3, V4, V5, V6, V7, V8) %>%
pivot_longer(
cols = V2:V8,
names_to = "velocity_band",
values_to = "effort_count"
) %>%
mutate(
velocity_band = factor(velocity_band, levels = paste0("V", 2:8))
)
ggplot(player_counts_long, aes(x = week, y = effort_count, fill = velocity_band)) +
geom_col(position = "stack") +
scale_fill_brewer(palette = "Set2") +
labs(
title = paste("Sprint Effort Distribution by Velocity Band for Player", player_id),
x = "Week",
y = "Sprint Effort Count",
fill = "Velocity Band"
) +
theme_classic()
Look at ID_11’s plot from the last question. This player is an offensive lineman. Looking at the plot, he is mostly in band 2 and 3 while never reaching bands 7 or 8 and rarely reaching band 6. Despite this, this athlete’s weekly max velocity is always at least 75% of their all-time max velocity. Only looking at the absolute bands that are provided, we might come to the conclusion that ID_11 is not achieving high running speeds, however after looking at his max velocity efforts relative to his all-time max velocity, he is reaching high running speeds.
Looking at team data as a whole, since January 1, 2024 there is absolute no deviance from 0. That means that since January 1, 2024, the team has had the same average running imbalance of 0. This makes sense given that the team is so large and that imbalances can go from -100% to 100%. This suggests that throughout this time, at no point was there a team sway to one side. There also weren’t any points since January 1, 2024 that the team had any large spikes in average absolute value of running imbalance. This suggests that at no point throughout the season were they largers pikes than normal in running imbalance. Each player tends to have a very unique trend in their running imbalance. Looking at how the team varies but also at how each player varies throughout the season, it’s hard to make out any pattern that’s applicable to most people. The variance of running imbalance varies greatly between each player. Instead we looked at the variances between players who were injured and those who were not. Based on three different bootstrapped findings, we can see that the variances between players who were injured and those who were not were statistically significant. For the first bootstrap, we compared the variance of the pooled groups meaning that the variance in running imbalance for players who were injured and those who were not were compared. This resulted in a 90% confidence interval which suggested that the difference in variance between the two groups is between 0.30 and 3.45. This suggests that when looking at the variance of the two groups separately but all the players are pooled together, the variances will most likely be different by factor between 0.30 and 3.45 and the variance for the injured pool will be greater than that of the uninjured pool. For the second bootstrap, each player’s variance was taken individually. This unpooled approach was taken to see if an individual player’s variance in running imbalance could potentially be related to HSI risk. The bootstrap algorithm in this case took the averaged variances of the bootstrapped sample for each group and compared them. This bootstrap produced a 90% confidence interval for the difference in average variance between the two groups is 0.79 to 1.32. These results suggest that players who sustained a hamstring injury since January 1, 2024 had, on average, a greater variance in their running imbalance by about 1.06. This suggests that there is a relationship between variance in running imbalance and HSI risk. This found increase in variability will be used to address the following questions. For the third bootstrap, we wanted to see if there was a difference in the average mean absolute value in running imbalance between players who were and weren’t injured. The bootstrapping algorithm for this test calculated the average absolute distance value or each running imbalance measurement and found the average for each sample. This test found that at the 90% significance level, injured players had an average running imbalance absolute value between 0.06 and 0.32 greater than their uninjured counterparts. These values though, when we consider that the range of running imbalance goes from 0 to 100 is small and may be hard to detect when out in the field. ### Team Analysis
#team variation
Historical_Running_clean %>%
summarize(Team_Variation = var(Running.Imbalance))
#individual player variation
Historical_Running_clean %>%
group_by(anon_id) %>%
summarize(Player_Variation = var(na.omit(Running.Imbalance))) %>%
ungroup()
Historical_Running_clean %>%
group_by(anon_id) %>%
mutate(Player.var = var(na.omit(Running.Imbalance))) %>%
ungroup() %>%
summarize(Average_Player_Variance = mean(na.omit(Player.var)))
Historical_Running_clean <- Historical_Running_clean %>%
group_by(Date) %>%
mutate(Date.Variance = var(na.omit(Running.Imbalance)),
Date.Avg.Abs.Value = mean(abs(na.omit(Running.Imbalance)))) %>%
ungroup()
#calculating mean and variance for team data
team_mean <- mean(Historical_Running_clean$Running.Imbalance)
team_sd <- sd(Historical_Running_clean$Running.Imbalance)
#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = team_mean, color = "#CFB87C") +
geom_hline(yintercept = team_mean + team_sd) +
geom_hline(yintercept = team_mean - team_sd) +
geom_hline(yintercept = team_mean + (2*team_sd), color = "#A2A4A3") +
geom_hline(yintercept = team_mean - (2*team_sd), color = "#A2A4A3") +
geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215")
#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = team_mean, color = "#CFB87C") +
geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
geom_line(aes(x=Date, y=Date.Avg.Abs.Value), color = "#CFB87C") +
geom_line(aes(x=Date, y=-Date.Avg.Abs.Value), color = "#CFB87C") +
labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215")
#making histogram of team running imbalance data
ggplot(Historical_Running_clean, aes(Running.Imbalance)) +
geom_histogram() +
labs(title = "Team Running Imbalance Since January 1, 2024", x="Running Imbalance")
#making histogram for running imbalance of all injured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance)) +
geom_histogram(fill = "#CFB87C", alpha = 0.75) +
#adding in 95% confidence interval
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.025), color = "#CFB87C") +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.975), color = "#CFB87C") +
xlim(-21,21) +
labs(title = "Running Imbalance for Players with HSI since January 1, 2024")
#making histogram for running imbalance of all uninjured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
geom_histogram(fill = "black", alpha = 0.75) +
#adding in 95% confidence interval
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.025)) +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.975)) +
xlim(-21,21) +
labs(title = "Running Imbalance for Players without HSI since January 1, 2024")
#Plotting injured and uninjured histograms over top one another
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
geom_histogram(alpha = 0.75) +
#adding in 95% CI for uninjured players
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.025)) +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.975)) +
geom_histogram(data = Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance), fill = "#CFB87C", alpha = 0.75) +
#adding in 95% CI for injured players
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.025), color = "#CFB87C") +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.975), color = "#CFB87C") +
xlim(-21,21) +
labs(title = "Running Imbalance for Players with and without HSI since January 1, 2024")
#making scatter plot of running imbalance data for injured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
ylim(-21,21) +
labs(title = "Running Imbalance for Players with HSI since January 1, 2024")
#making scatter plot of running imbalance for uninjured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
ylim(-21,21) +
labs(title = "Running Imbalance for Players without HSI since January 1, 2024")
#Splitting up the data sets and calculating player variance and measurement absolute value
injured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,] %>%
mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
group_by(anon_id) %>%
mutate(Player.Variance = var(Running.Imbalance)) %>%
ungroup()
uninjured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,] %>%
mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
group_by(anon_id) %>%
mutate(Player.Variance = var(Running.Imbalance)) %>%
ungroup()
#making a data frame to hold all of the within group variances
group_variances <- data.frame(injured_var = rep(NA,5000),
uninjured_var = rep(NA,5000),
diff_in_var = rep(NA, 5000))
#bootstrap for variances, 1000 iterations
for(i in 1:5000){
#random seed
set.seed(i)
#taking samples from each of the data sets, same number of rows, replacement true
injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
#storing the calculated variances in data frame
group_variances[i,1] = var(injured_sample$Running.Imbalance)
group_variances[i,2] = var(uninjured_sample$Running.Imbalance)
group_variances[i,3] = group_variances[i,1] - group_variances[i,2]
}
ggplot(data=group_variances, aes(diff_in_var)) +
geom_histogram() +
geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.05), color= "#CFB87C") +
geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.95), color= "#CFB87C") +
labs(x="Difference in Variance")
ggplot(data=group_variances) +
geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
geom_histogram(aes(uninjured_var), alpha = 0.75) +
labs(x="Variance")
#making a data frame to hold all of the average player variances between groups
mean_player_variances <- data.frame(injured_var = rep(NA,5000),
uninjured_var = rep(NA,5000),
diff_in_var = rep(NA, 5000))
for(i in 1:5000){
#random seed
set.seed(i)
#taking samples from each of the data sets, same number of rows, replacement true
injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
#storing the calculated variances in data frame
mean_player_variances[i,1] = mean(na.omit(injured_sample$Player.Variance))
mean_player_variances[i,2] = mean(na.omit(uninjured_sample$Player.Variance))
mean_player_variances[i,3] = mean_player_variances[i,1] - mean_player_variances[i,2]
}
ggplot(data = mean_player_variances, aes(diff_in_var)) +
geom_histogram() +
geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.05), color= "#CFB87C") +
geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.95), color= "#CFB87C") +
labs(x="Difference in Variance")
ggplot(data=mean_player_variances) +
geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
geom_histogram(aes(uninjured_var), alpha = 0.75) +
labs(x="Average Variance")
#making a data frame to hold all of the average absolute differences from 0
group_distance <- data.frame(injured_dist = rep(NA,5000),
uninjured_dist = rep(NA,5000),
diff_in_dist = rep(NA, 5000))
#bootstrap for variances, 1000 iterations
for(i in 1:5000){
#random seed
set.seed(i)
#taking samples from each of the data sets, same number of rows, replacement true
injured_sample <- sample_n(injured_data, replace = TRUE, size = 1658)
uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2405)
#storing the calculated variances in data frame
group_distance[i,1] = mean(injured_sample$Player.Absolute.Dist)
group_distance[i,2] = mean(uninjured_sample$Player.Absolute.Dist)
group_distance[i,3] = group_distance[i,1] - group_distance[i,2]
}
ggplot(data = group_distance, aes(diff_in_dist)) +
geom_histogram() +
geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.05), color= "#CFB87C") +
geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.95), color= "#CFB87C") +
labs(x="Difference in Distance")
ggplot(data = group_distance) +
geom_histogram(aes(injured_dist), alpha = 0.75, fill ="#CFB87C") +
geom_histogram(aes(uninjured_dist), alpha = 0.75) +
labs(x="Average Absolute Value")
#removing junk that came from the loops
remove(i,team_mean, team_sd, injured_sample, uninjured_sample, group_variances, injured_data, uninjured_data, mean_player_variances, group_distance)
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Incident_Report_clean <- Incident_Report_clean %>%
mutate(Specific.Position = Position,
Position = case_when(Position %in% COMBO ~ "COMBO",
Position %in% BIG ~ "BIG",
Position %in% SKILL ~ "SKILL"))
Based on the analysis below, there doesn’t seem to be any major discernible differences in running imbalance before or following an injury. This suggests that there may not be a direct link between HSI risk and running imbalance value directly beforehand. Instead, the trends of many players who were injured seems to have a relatively consistent trend not entirely dependent on time. Looking at summary statistics of running imbalance in the weeks leading up to and following a hamstring injury, there are also no glaring trends. For this analysis, we looked at the mean and variance in running imbalance per week leading up to and after an injury for all of the injured players with running imbalance data. This showed us that there is no clear indicator of HSI risk in running imbalance or any summary statistic of it. Instead it may be more useful to look at each player’s total running imbalance and their individual variance. This seems to be more of a useful tool for differentiating between injured and uninjured athletes.
#getting running imbalances for just injured players
Injured_Historical_Running <- Historical_Running_clean %>%
filter(anon_id %in% injured_IDs)
#making new column to represent when in time injury would be, negative means before injury and positive means after injury, 0 means date of injury if there's data for that day
Injured_Historical_Running$Weeks.After.Injury <- rep(NA, 1658)
Injured_Historical_Running$Injury.Count <- rep(NA, 1658)
#making new column in incident report for the injury count
Incident_Report_clean$Injury.Count <- rep(NA, 122)
#go through all of the injured players in the data set
for(i in 1:22){
#get the dates each player was injured
injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
#go through all of the dates in which the player had an injury
for(j in 1:length(injury_dates)){
#calculate dates for 1, 2, 3, 4 weeks before and after each injury date
past_1 <- injury_dates[j]-7
past_2 <- injury_dates[j]-14
past_3 <- injury_dates[j]-21
past_4 <- injury_dates[j]-28
future_1 <- injury_dates[j]+7
future_2 <- injury_dates[j]+14
future_3 <- injury_dates[j]+21
future_4 <- injury_dates[j]+28
#Calculating how many injuries this is for the player
injury_count <- as.character((length(injury_dates)) - j + 1)
#compare date of data point for each player to date of injury, store in Weeks.After.Injury column, store injury count
#first week after injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > injury_dates[j] &
Injured_Historical_Running$Date<=future_1,]$Weeks.After.Injury <- "1"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > injury_dates[j] &
Injured_Historical_Running$Date<=future_1,]$Injury.Count <- injury_count
#second week after injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_1 &
Injured_Historical_Running$Date<=future_2,]$Weeks.After.Injury <- "2"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_1 &
Injured_Historical_Running$Date<=future_2,]$Injury.Count <- injury_count
#third week after injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_2 &
Injured_Historical_Running$Date<=future_3,]$Weeks.After.Injury <- "3"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_2 &
Injured_Historical_Running$Date<=future_3,]$Injury.Count <- injury_count
#fourth week after injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_3 &
Injured_Historical_Running$Date<=future_4,]$Weeks.After.Injury <- "4"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_3 &
Injured_Historical_Running$Date<=future_4,]$Injury.Count <- injury_count
#week right before injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < injury_dates[j] &
Injured_Historical_Running$Date>=past_1,]$Weeks.After.Injury <- "-1"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < injury_dates[j] &
Injured_Historical_Running$Date>=past_1,]$Injury.Count <- injury_count
#two weeks before injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_1 &
Injured_Historical_Running$Date>=past_2,]$Weeks.After.Injury <- "-2"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_1 &
Injured_Historical_Running$Date>=past_2,]$Injury.Count <- injury_count
#three weeks before injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_2 &
Injured_Historical_Running$Date>=past_3,]$Weeks.After.Injury <- "-3"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_2 &
Injured_Historical_Running$Date>=past_3,]$Injury.Count <- injury_count
#four weeks before injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_3 &
Injured_Historical_Running$Date>=past_4,]$Weeks.After.Injury <- "-4"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_3 &
Injured_Historical_Running$Date>=past_4,]$Injury.Count <- injury_count
#Date of Injury
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date == injury_dates[j],]$Weeks.After.Injury <- "0"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date == injury_dates[j],]$Injury.Count <- injury_count
#adding injury count to indicent report
Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Injury.Count <- injury_count
}
}
#making weeks after injury and injury count into a factor and combining data sets back together
Injured_Historical_Running <- Injured_Historical_Running %>%
mutate(Weeks.After.Injury = factor(Weeks.After.Injury),
Injury.Count = factor(Injury.Count))
Historical_Running_clean <- left_join(Historical_Running_clean, Injured_Historical_Running)
#getting rid of junk that was from the loop
remove(future_1, future_2, future_3, future_4, i, injury_count, injury_dates, j, past_1, past_2, past_3, past_4)
remove(Injured_Historical_Running)
for(i in 1:22){
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id==injured_IDs[i],], aes(Date, Running.Imbalance)) +
geom_line() +
geom_point(aes(color=Weeks.After.Injury)) +
geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue")) +
theme_minimal() +
labs(title="Running Imbalance", subtitle = injured_IDs[i])
print(p)
}
#making summary statistics of running imbalance per week relative to injury
Historical_Running_clean <- Historical_Running_clean %>%
group_by(anon_id, Injury.Count, Weeks.After.Injury) %>%
mutate(Weeks.After.Injury.Variability = var(Running.Imbalance),
Weeks.After.Injury.Mean = mean(abs(Running.Imbalance))) %>%
ungroup()
for(i in 1:22){
#looking at mean running imbalance per week before and after injury for each injured player
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Mean, group=Injury.Count)) +
geom_line() +
geom_point(aes(color=Weeks.After.Injury)) +
xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
labs(title="Average Absolute Distance per Week",injured_IDs[i]) +
scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
print(p)
}
for(i in 1:22){
#looking at variance in running imbalance per week before and after injury for each injured player
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Variability, group=Injury.Count)) +
geom_line() +
geom_point(aes(color=Weeks.After.Injury)) +
xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
labs(title="Variance in Running Imbalance per Week", subtitle=injured_IDs[i]) +
scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
print(p)
}
for(i in 1:71){
#only plotting if there are over 15 data points for each player
if(nrow(Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],])>15){
#calculating how strong of a non linear trend there is using a gam
df <- summary(gam(Running.Imbalance~s(Days.Since.Start),
data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]))$edf
#calculating how strong of a linear trend there is using kendall correlation
Kendall_Cor <- cor(x=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Days.Since.Start, y=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Running.Imbalance, method="kendall")
#if there is a detectable linear relationship in the data
if(abs(Kendall_Cor>0.2) & df<=3){ #linear trend in data
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
geom_line() +
geom_point() +
geom_smooth(color= "red",
method="lm",
se=FALSE) +
labs(title="Running Imbalance Since January 1, 2024 (linear trend)", subtitle=all_IDs[i],) +
xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
print(p)
}
#if there is a detectable non-linear trend in the data
if(df > 3){ #non linear trend in data
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
geom_line() +
geom_point() +
geom_smooth(color = "skyblue",
se=FALSE) +
labs(title="Running Imbalance Since January 1, 2024 (non-linear trend)", subtitle=all_IDs[i],) +
xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
print(p)
}
#if there is no detectable trend in the data
if(df<=3 & abs(Kendall_Cor)<=0.2){
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
geom_line() +
geom_point() +
labs(title="Running Imbalance Since January 1, 2024 (non-linear trend)", subtitle=all_IDs[i],) +
xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
}
}
}
#plotted only from January 1, 2025 to May 1, 2025
remove(i, p, df, Kendall_Cor)
Based on the analysis below, we can see that by solely using variance in running imbalance from the time of the injury to the end of the predicted return-to-play range is not a very strong predictor for whether or not it will take longer for a player to recover or not. In this analysis, we used the running imbalance in the time frame starting with the injury date to the end of the prognosis time frame. With these specific running imbalance values, we calculated the variance in running imbalance and whether or not the athlete returned to play within the predicted time frame or not. This analysis found that when using the variance in these time frames as the only predictor in a logistic regression model, the slope coefficient associated with the variance was statistically significant at the \(\alpha = 0.01\) significance level. With that though, the accuracy of the model was only around 0.6 suggesting that it wasn’t super strong in practice. In order to understand the impact that variance in running imbalance had on whether or not a player returned in the predicted time frame or not, we performed a bootstrap. We separated the observations in which players did and did not return in the predicted time frame into two different data sets. We then sampled from each of these two data sets and calculated the average variance for each sample. This was repeated 5000 times. This gave us an estimate of the average variance in running imbalance for players who returned within the predicted time frame and those who did not. This bootstrap revealed that players who did not return within the predicted time frame had a variance greater by roughly 1.2 during their time of recovery than those who returned on time. This tells us that while variance in running imbalance is not directly strong enough to predict whether or not an athlete will return within the predicted time frame, it can be used to supplement prognosis or make adjustments to the prognosis during the time of recovery of a HSI.
#Calculating date back to play in incident report data set
Incident_Report_clean <- Incident_Report_clean %>%
filter(!is.na(Injury.Prognosis))%>%
#calculating how long predicted time loss is based on prognosis
#beginning of predicted range of return
mutate(Expected.Start.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
Date.of.Injury,
ifelse(Injury.Prognosis=="Less than 1 Week",
Date.of.Injury,
ifelse(Injury.Prognosis=="1-4 Weeks",
Date.of.Injury+days(7),
Date.of.Injury+days(28))))),
#end of predicted range of return
Expected.End.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
Date.of.Injury,
ifelse(Injury.Prognosis=="Less than 1 Week",
Date.of.Injury+days(7),
ifelse(Injury.Prognosis=="1-4 Weeks",
Date.of.Injury+days(28),
Date.of.Injury+days(56)))))) %>%
group_by(anon_id, Date.of.Injury) %>%
#calculating actual date cleared to return
mutate(Actual.Return = Date.of.Injury+days(sum(na.omit(Days.in.Status)))) %>%
ungroup()
for(i in 1:22){
#looking at mean running imbalance per week before and after injury for each injured player
p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Running.Imbalance)) +
geom_line(linewidth=0.1) +
geom_point() +
#Marking when injury occurred with red line
geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury, color="red") +
#Marking actual return date with solid green line
geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Actual.Return, color="green", linetype=1) +
#Marking beginning of predicted return to play range with purple dotted line
geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.Start.Return, color="purple", linetype=3) +
#Marking end of predicted return to play range with purple dotted line
geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.End.Return, color="purple", linetype=3) +
labs(title=injured_IDs[i])
print(p)
}
#looking at running imbalance of only injured players
Injured_Historical_Running <- Historical_Running_clean %>%
filter(anon_id %in% injured_IDs)
#Making binary column if date was in time range of injury prognosis
Injured_Historical_Running$Date.in.Range <- rep(0, 1658)
#go through all of the injured players in the data set
for(i in 1:22){
#get the dates each player was injured
injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
#go through dates of injury for each player
for(j in 1:length(injury_dates)){
if(injured_IDs[i] == "ID_50"){ #ID_50 does not have enough running imbalance data
break
}
#get the expected date of return for that instance of injury
expected_return <- as.Date(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Expected.End.Return[1])
#if the date in running imbalance is between day of injury and last day of prediction, set as 1
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] & Injured_Historical_Running$Date >= injury_dates[j] & Injured_Historical_Running$Date <= expected_return,]$Date.in.Range <- 1
}
}
#making a column for the variance in running imbalance for each injury instance range
Injured_Historical_Running <- Injured_Historical_Running %>%
filter(Date.in.Range == 1) %>%
group_by(anon_id, Injury.Count) %>%
mutate(Injury.Variability = var(Running.Imbalance)) %>%
ungroup()
#adding injury and running data sets together
Injured_Data <- left_join(Injured_Historical_Running, Incident_Report_clean, by=c("anon_id", "Injury.Count"), relationship = "many-to-many") %>%
#making new column if player returned in predicted time frame
mutate(Return.in.Range = ifelse(Actual.Return>=Expected.Start.Return & Actual.Return <= Expected.End.Return, 1, 0)) %>%
#removing rows that are missing important data
filter(!is.na(Injury.Variability),
!is.na(Return.in.Range))
set.seed(1000)
#making a 75% to 25% training to testing split
rows <- sample(1:nrow(Injured_Data), size=(nrow(Injured_Data)*0.75), replace=FALSE)
Injured_train <- Injured_Data[rows,]
Injured_test <- Injured_Data[-rows,]
#building logistic regression model from training data
return_to_play_model <- glm(Return.in.Range~Injury.Variability, data=Injured_train, family="binomial")
#looking at coefficients and p-values
summary(return_to_play_model)
#making predictions on testing data based off of the model built
Injured_test <- Injured_test %>%
mutate(Prediction = ifelse(predict(return_to_play_model, newdata=Injured_test, type="response")>0.5, 1, 0))
#calculating CER
Injured_test %>%
summarize(CER = mean(Prediction != Return.in.Range))
#making model with all of the data
return_to_play_cv <- glm(Return.in.Range~Injury.Variability, data=Injured_Data, family="binomial")
#making cost function for CER
cost <- function(obs, pred){
mean((pred <= 0.5) & obs==1 | (pred > 0.5) & obs==0)
}
set.seed(1000)
#cross validating with K=10
ten_cv <- cv.glm(data=Injured_Data,glmfit=return_to_play_cv,cost,K=10)
#extract average error
ten_cv$delta[1]
#taking only players who recovered in predicted time frame
In_Range <- Injured_Data %>%
filter(Return.in.Range == 1,
!is.na(Injury.Variability))
#taking only players who did not recover in the predicted time frame
Out_Range <- Injured_Data %>%
filter(Return.in.Range == 0,
!is.na(Injury.Variability))
#making data frame to hold values
Range_Variances <- data.frame(in.avg.var = rep(NA, 5000),
out.avg.var = rep(NA, 5000),
diff.avg.var = rep(NA, 5000))
#bootstrapping 5000 times
for(i in 1:5000){
set.seed(i)
#sample from players who recovered in predicted range
in_sample <- sample_n(In_Range, size=308, replace=TRUE)
#sample fro players who did not recover in predicted range
out_sample <- sample_n(Out_Range, size=477, replace=TRUE)
#calculating variances of each sample and storing in data frame
Range_Variances[i,1] <- mean(in_sample$Injury.Variability)
Range_Variances[i,2] <- mean(out_sample$Injury.Variability)
Range_Variances[i,3] <- Range_Variances[i,2] - Range_Variances[i,1] #diff in variances
}
#plotting differences in average variance
ggplot(data=Range_Variances, aes(diff.avg.var)) +
geom_histogram() +
#adding in 90% CI (does not include 0)
geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.05), color ="#CFB87C") +
geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.95), color ="#CFB87C")
#plotting average variances of difference groups
ggplot(data=Range_Variances) +
#players who recovered in predicted time frame
geom_histogram(aes(in.avg.var), alpha = 0.75) +
#players who did not recover in the predicted time frame
geom_histogram(aes(out.avg.var), alpha = 0.75, fill ="#CFB87C") +
labs(x="Average Variance")
remove(p,i,j,rows,ten_cv, cost, Injured_Data, Injured_Historical_Running, Injured_test, Injured_train, Range_Variances, return_to_play_cv, return_to_play_model, expected_return, injury_dates, In_Range, Out_Range, in_sample, out_sample)